Algorytm K-Nearest Neighbors (KNN)

W ramach rego skryptu zastaną przetestowane możliwości algorytmu KNN w zakresie klasyfikacji i regresji. Celem będzie:

  • klasyfikacja: clicks_per_view = 0 vs. clicks_per_view > 0

  • regresja: przewidywanie wskaźnika clicks_per_view

  • kombinacja obu metod: przewidywanie, czy clicks_per_view > 0, a następnie regresja dokładnej wartości wskaźnika

    Ze względu na duży rozmiar danych (~30 000 wierszy) w ramach analizy wykorzystany zostanie algorytm Fast KNN (fknn).

dane <- read.csv("data/wikipedia.csv", row.names = 1)
head(subset(dane, select = -summary))
##                     title word_count log_word_count
## 0             mathematics       8055       8.994172
## 1 language of mathematics        797       6.682109
## 2                  abacus       3991       8.292048
## 3            roman abacus       2018       7.610358
## 4            tlalcuahuitl        152       5.030438
## 5  babylonian mathematics       1975       7.588830
##                                                                                                                                                                                                                                                                                                                                                                                                                                                image_titles
## 0                                                                                       ['arithmetic symbols.svg', 'bakhshali numerals 2.jpg', 'carl friedrich gauss 1840 by jensen.jpg', 'cauchy sequence illustration.svg', 'commons-logo.svg', 'fieldsmedalfront.jpg', 'giant pufferfish skin pattern detail.jpg', 'godfreykneller-isaacnewton-1689.jpg', 'gottfried wilhelm leibniz, bernhard christoph francke.jpg', 'illustrationcentraltheorem.png']
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                 ['question book-new.svg']
## 2                                                                                                                                                                        ['1543 robert recorde.png', 'abacus (psf).png', 'abacus 6.png', 'adam riesen.jpg', 'bbinary abacus 002.jpg', 'commons-logo.svg', 'houghton typ 520.03.736 - margarita philosophica.jpg', 'kugleramme.jpg', 'köbel böschenteyn 1514.jpg', 'nuvola apps edu mathematics blue-p.svg']
## 3                                                                                                                                                                                         ['abacus usages.jpg', 'ambox important.svg', 'commons-logo.svg', 'nuvola apps edu mathematics blue-p.svg', 'people icon.svg', 'romanabacusrecon.jpg', 'roman tablet employed in making arithmetical calculations (14781129921).jpg', 'symbol category class.svg']
## 4                                                                                                                                                                                   ['cemacolli one third tlalcuahuitl aztec glyph.png', 'cemmatl thee fifths of a tlalcuahuitl.png', 'cemmitl one half of tlalcuahuitl aztec glyph.png', 'cemomitl one fifth of tlalcuahutil aztec glyph.png', 'cenyollotli two fifths of a tlalcuahuitl aztec glyph.png']
## 5 ['asia (orthographic projection).svg', 'clay tablet, mathematical, geometric-algebraic, similar to the euclidean geometry. from tell harmal, iraq. 2003-1595 bce. iraq museum.jpg\\', "file:clay tablet, mathematical, geometric-algebraic, similar to the pythagorean theorem. from tell al-dhabba\\'i, iraq. 2003-1595 bce. iraq museum.jpg", \\'file:nuvola apps edu mathematics blue-p.svg', 'symbol category class.svg', 'ybc-7289-obv-labeled.jpg']
##                                                                                                                                                                                                                                                                                                                                                                                                                         image_titles_string
## 0                                                                                             arithmetic symbols.svg, bakhshali numerals 2.jpg, carl friedrich gauss 1840 by jensen.jpg, cauchy sequence illustration.svg, commons-logo.svg, fieldsmedalfront.jpg, giant pufferfish skin pattern detail.jpg, godfreykneller-isaacnewton-1689.jpg, gottfried wilhelm leibniz, bernhard christoph francke.jpg, illustrationcentraltheorem.png
## 1                                                                                                                                                                                                                                                                                                                                                                                                                     question book-new.svg
## 2                                                                                                                                                                              1543 robert recorde.png, abacus (psf).png, abacus 6.png, adam riesen.jpg, bbinary abacus 002.jpg, commons-logo.svg, houghton typ 520.03.736 - margarita philosophica.jpg, kugleramme.jpg, köbel böschenteyn 1514.jpg, nuvola apps edu mathematics blue-p.svg
## 3                                                                                                                                                                                           abacus usages.jpg, ambox important.svg, commons-logo.svg, nuvola apps edu mathematics blue-p.svg, people icon.svg, romanabacusrecon.jpg, roman tablet employed in making arithmetical calculations (14781129921).jpg, symbol category class.svg
## 4                                                                                                                                                                               cemacolli one third tlalcuahuitl aztec glyph.png, cemmatl thee fifths of a tlalcuahuitl.png, cemmitl one half of tlalcuahuitl aztec glyph.png, cemomitl one fifth of tlalcuahutil aztec glyph.png, cenyollotli two fifths of a tlalcuahuitl aztec glyph.png
## 5 asia (orthographic projection).svg, clay tablet, mathematical, geometric-algebraic, similar to the euclidean geometry. from tell harmal, iraq. 2003-1595 bce. iraq museum.jpg', "file:clay tablet, mathematical, geometric-algebraic, similar to the pythagorean theorem. from tell al-dhabba'i, iraq. 2003-1595 bce. iraq museum.jpg", 'file:nuvola apps edu mathematics blue-p.svg, symbol category class.svg, ybc-7289-obv-labeled.jpg
##   num_images log_num_images mo_page_views log_mo_page_views clicks_in
## 0         10      2.3978953        160941         11.988799    117924
## 1          1      0.6931472          3135          8.050703      3967
## 2         10      2.3978953         44004         10.692059     42122
## 3          8      2.1972246          2568          7.851272      1608
## 4          5      1.7917595           146          4.990433        51
## 5          6      1.9459101          8715          9.072916      5874
##   log_clicks_in clicks_out log_clicks_out clicks_per_view log_clicks_per_view
## 0     11.677804      54025      10.897221      0.33568202          0.28944204
## 1      8.286017        540       6.293419      0.17224880          0.15892396
## 2     10.648349       4532       8.419139      0.10299064          0.09802525
## 3      7.383368        155       5.049856      0.06035826          0.05860683
## 4      3.951244          0       0.000000      0.00000000          0.00000000
## 5      8.678461       1268       7.145984      0.14549627          0.13583797
##   is_mo_page_views_zero is_clicks_in_zero is_clicks_out_zero
## 0                 False             False              False
## 1                 False             False              False
## 2                 False             False              False
## 3                 False             False              False
## 4                 False             False               True
## 5                 False             False              False
##   is_clicks_per_view_zero        type
## 0                   False mathematics
## 1                   False mathematics
## 2                   False mathematics
## 3                   False mathematics
## 4                    True mathematics
## 5                   False mathematics
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            categories
## 0                                                                                                                                                                                                                                                                  ['all wikipedia articles written in american english', 'all articles with dead external links', 'all articles with failed verification', 'articles containing ancient greek (to 1453)-language text', 'articles containing greek-language text', 'articles containing latin-language text', 'articles with dead external links from october 2025', 'articles with failed verification from october 2024', 'articles with short description', 'cs1 german-language sources (de)', 'cs1 errors: isbn date', 'cs1 errors: periodical ignored', 'formal sciences', 'main topic articles', 'mathematics', 'pages using multiple image with manual scaled images', 'pages using sidebar with the child parameter', 'short description is different from wikidata', 'use american english from august 2022', 'use mdy dates from october 2024', 'webarchive template archiveis links', 'webarchive template wayback links', 'wikipedia indefinitely move-protected pages', 'wikipedia indefinitely semi-protected pages']
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               ['all articles needing additional references', 'articles needing additional references from june 2022', 'articles with short description', 'language', 'mathematics', 'short description is different from wikidata']
## 2 ['abacus', 'all wikipedia articles written in american english', 'all articles with unsourced statements', 'ancient roman mathematics', 'articles containing ancient greek (to 1453)-language text', 'articles containing chinese-language text', 'articles containing japanese-language text', 'articles containing latin-language text', 'articles containing russian-language text', 'articles with short description', 'articles with unsourced statements from april 2024', 'cs1: long volume value', 'cs1 chinese-language sources (zh)', 'cs1 greek-language sources (el)', 'cs1 korean-language sources (ko)', 'cs1 latin-language sources (la)', 'cs1 spanish-language sources (es)', 'cs1 errors: isbn date', 'cs1 uses korean-language script (ko)', 'chinese mathematics', 'commons link from wikidata', 'egyptian mathematics', 'greek mathematics', 'indian mathematics', 'japanese mathematics', 'korean mathematics', 'mathematical tools', 'pages with nahuatl languages ipa', 'short description matches wikidata', 'use american english from may 2021', 'use mdy dates from june 2013', 'wikipedia articles incorporating a citation from eb9', 'wikipedia articles incorporating a citation from the 1911 encyclopaedia britannica with wikisource reference']
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                ['abacus', 'all articles that may contain original research', 'all articles with unsourced statements', 'ancient roman mathematics', 'ancient roman technology', 'articles that may contain original research from march 2024', 'articles with short description', 'articles with unsourced statements from march 2022', 'cs1: unfit url', 'cs1 german-language sources (de)', 'short description matches wikidata']
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               ['articles with short description', 'aztec mathematics', 'pages with nahuatl languages ipa', 'short description matches wikidata', 'units of length']
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 ['articles with short description', 'babylonian mathematics', 'cs1 maint: multiple names: authors list', 'mathematics of ancient history', 'short description is different from wikidata', 'use dmy dates from february 2024', 'webarchive template wayback links']
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 categories_string
## 0                                                                                                                                                                                                                                                all wikipedia articles written in american english, all articles with dead external links, all articles with failed verification, articles containing ancient greek (to 1453)-language text, articles containing greek-language text, articles containing latin-language text, articles with dead external links from october 2025, articles with failed verification from october 2024, articles with short description, cs1 german-language sources (de), cs1 errors: isbn date, cs1 errors: periodical ignored, formal sciences, main topic articles, mathematics, pages using multiple image with manual scaled images, pages using sidebar with the child parameter, short description is different from wikidata, use american english from august 2022, use mdy dates from october 2024, webarchive template archiveis links, webarchive template wayback links, wikipedia indefinitely move-protected pages, wikipedia indefinitely semi-protected pages
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         all articles needing additional references, articles needing additional references from june 2022, articles with short description, language, mathematics, short description is different from wikidata
## 2 abacus, all wikipedia articles written in american english, all articles with unsourced statements, ancient roman mathematics, articles containing ancient greek (to 1453)-language text, articles containing chinese-language text, articles containing japanese-language text, articles containing latin-language text, articles containing russian-language text, articles with short description, articles with unsourced statements from april 2024, cs1: long volume value, cs1 chinese-language sources (zh), cs1 greek-language sources (el), cs1 korean-language sources (ko), cs1 latin-language sources (la), cs1 spanish-language sources (es), cs1 errors: isbn date, cs1 uses korean-language script (ko), chinese mathematics, commons link from wikidata, egyptian mathematics, greek mathematics, indian mathematics, japanese mathematics, korean mathematics, mathematical tools, pages with nahuatl languages ipa, short description matches wikidata, use american english from may 2021, use mdy dates from june 2013, wikipedia articles incorporating a citation from eb9, wikipedia articles incorporating a citation from the 1911 encyclopaedia britannica with wikisource reference
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    abacus, all articles that may contain original research, all articles with unsourced statements, ancient roman mathematics, ancient roman technology, articles that may contain original research from march 2024, articles with short description, articles with unsourced statements from march 2022, cs1: unfit url, cs1 german-language sources (de), short description matches wikidata
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       articles with short description, aztec mathematics, pages with nahuatl languages ipa, short description matches wikidata, units of length
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             articles with short description, babylonian mathematics, cs1 maint: multiple names: authors list, mathematics of ancient history, short description is different from wikidata, use dmy dates from february 2024, webarchive template wayback links
##   num_categories log_num_categories num_links_internal log_num_links_internal
## 0             24           3.218876                500               6.216606
## 1              6           1.945910                 83               4.430817
## 2             33           3.526361                235               5.463832
## 3             11           2.484907                 92               4.532599
## 4              5           1.791759                 11               2.484907
## 5              7           2.079442                261               5.568345
##   num_editors log_num_editors num_edits log_num_edits        creation_date
## 0         284        5.652489       500      6.216606 2001-11-08t15:31:38z
## 1         170        5.141664       308      5.733341 2003-11-20t14:31:23z
## 2         279        5.634790       500      6.216606 2001-11-02t14:55:45z
## 3         158        5.068904       339      5.828946 2004-08-31t15:44:24z
## 4           9        2.302585        52      3.970292 2012-09-08t08:06:32z
## 5         276        5.624018       500      6.216606 2006-01-27t06:23:38z
##   creation_date_timestamp log_creation_date_timestamp links_per_word
## 0              1005233498                    20.72849              0
## 1              1069338683                    20.79031              0
## 2              1004712945                    20.72797              0
## 3              1093967064                    20.81308              0
## 4              1347091592                    21.02121              0
## 5              1138343018                    20.85284              0
##   log_links_per_word cat_mathematics_related_lists cat_archaeological_artifacts
## 0                  0                             0                            0
## 1                  0                             0                            0
## 2                  0                             0                            0
## 3                  0                             0                            0
## 4                  0                             0                            0
## 5                  0                             0                            0
##   cat_number_theory cat_quantum_mechanics cat_dynamical_systems
## 0                 0                     0                     0
## 1                 0                     0                     0
## 2                 0                     0                     0
## 3                 0                     0                     0
## 4                 0                     0                     0
## 5                 0                     0                     0
##   cat_20th_century_american_mathematicians cat_measurement cat_astrophysics
## 0                                        0               0                0
## 1                                        0               0                0
## 2                                        0               0                0
## 3                                        0               0                0
## 4                                        0               0                0
## 5                                        0               0                0
##   cat_21st_century_american_mathematicians cat_pseudohistory
## 0                                        0                 0
## 1                                        0                 0
## 2                                        0                 0
## 3                                        0                 0
## 4                                        0                 0
## 5                                        0                 0
##   cat_creators_of_writing_systems cat_historiography cat_historical_eras
## 0                               0                  0                   0
## 1                               0                  0                   0
## 2                               0                  0                   0
## 3                               0                  0                   0
## 4                               0                  0                   0
## 5                               0                  0                   0
##   cat_thermodynamics cat_nuclear_physics cat_death_conspiracy_theories
## 0                  0                   0                             0
## 1                  0                   0                             0
## 2                  0                   0                             0
## 3                  0                   0                             0
## 4                  0                   0                             0
## 5                  0                   0                             0
##   cat_topology cat_cryptography cat_philosophers_of_history
## 0            0                0                           0
## 1            0                0                           0
## 2            0                0                           0
## 3            0                0                           0
## 4            0                0                           0
## 5            0                0                           0
##   cat_mathematical_logic cat_condensed_matter_physics
## 0                      0                            0
## 1                      0                            0
## 2                      0                            0
## 3                      0                            0
## 4                      0                            0
## 5                      0                            0
##   cat_fellows_of_the_american_physical_society cat_physical_quantities
## 0                                            0                       0
## 1                                            0                       0
## 2                                            0                       0
## 3                                            0                       0
## 4                                            0                       0
## 5                                            0                       0
##   cat_living_people cat_mechanics cat_destroyed_populated_places
## 0                 0             0                              0
## 1                 0             0                              0
## 2                 0             0                              0
## 3                 0             0                              0
## 4                 0             0                              0
## 5                 0             0                              0
##   cat_pseudohistorians cat_fellows_of_the_american_mathematical_society
## 0                    0                                                0
## 1                    0                                                0
## 2                    0                                                0
## 3                    0                                                0
## 4                    0                                                0
## 5                    0                                                0
nrow(dane)
## [1] 28627

Train-test split

library(dplyr) 
## Warning: pakiet 'dplyr' został zbudowany w wersji R 4.4.3
## 
## Dołączanie pakietu: 'dplyr'
## Następujące obiekty zostały zakryte z 'package:stats':
## 
##     filter, lag
## Następujące obiekty zostały zakryte z 'package:base':
## 
##     intersect, setdiff, setequal, union
set.seed(123) 
train <- dane %>% sample_frac(0.8) 
test <- anti_join(dane, train)
## Joining with `by = join_by(title, summary, word_count, log_word_count,
## image_titles, image_titles_string, num_images, log_num_images, mo_page_views,
## log_mo_page_views, clicks_in, log_clicks_in, clicks_out, log_clicks_out,
## clicks_per_view, log_clicks_per_view, is_mo_page_views_zero, is_clicks_in_zero,
## is_clicks_out_zero, is_clicks_per_view_zero, type, categories,
## categories_string, num_categories, log_num_categories, num_links_internal,
## log_num_links_internal, num_editors, log_num_editors, num_edits, log_num_edits,
## creation_date, creation_date_timestamp, log_creation_date_timestamp,
## links_per_word, log_links_per_word, cat_mathematics_related_lists,
## cat_archaeological_artifacts, cat_number_theory, cat_quantum_mechanics,
## cat_dynamical_systems, cat_20th_century_american_mathematicians,
## cat_measurement, cat_astrophysics, cat_21st_century_american_mathematicians,
## cat_pseudohistory, cat_creators_of_writing_systems, cat_historiography,
## cat_historical_eras, cat_thermodynamics, cat_nuclear_physics,
## cat_death_conspiracy_theories, cat_topology, cat_cryptography,
## cat_philosophers_of_history, cat_mathematical_logic,
## cat_condensed_matter_physics, cat_fellows_of_the_american_physical_society,
## cat_physical_quantities, cat_living_people, cat_mechanics,
## cat_destroyed_populated_places, cat_pseudohistorians,
## cat_fellows_of_the_american_mathematical_society)`
rm(dane)
gc()
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells   955798  51.1    1901632 101.6  1379847  73.7
## Vcells 44161847 337.0   96768958 738.3 94009182 717.3

Feature selection

library(gt)
## Warning: pakiet 'gt' został zbudowany w wersji R 4.4.3
library(purrr)
## Warning: pakiet 'purrr' został zbudowany w wersji R 4.4.2
library(tibble) 
train %>% select(where(is.numeric))%>% 
  map_df(~ as.list(summary(.x))) %>% 
  mutate(variable = names(select(train, where(is.numeric)))) %>% 
  relocate(variable) %>% gt()
variable Min. 1st Qu. Median Mean 3rd Qu. Max.
word_count 1.000000e+00 2.022500e+02 4.830000e+02 1.244131e+03 1.351000e+03 4.167400e+04
log_word_count 6.931472e-01 5.314434e+00 6.182085e+00 6.278052e+00 7.209340e+00 1.063766e+01
num_images 0.000000e+00 1.000000e+00 2.000000e+00 3.300585e+00 5.000000e+00 1.000000e+01
log_num_images 0.000000e+00 6.931472e-01 1.098612e+00 1.223615e+00 1.791759e+00 2.397895e+00
mo_page_views 0.000000e+00 1.840000e+02 4.540000e+02 3.337458e+03 1.630500e+03 2.394312e+06
log_mo_page_views 0.000000e+00 5.220356e+00 6.120297e+00 6.415918e+00 7.397255e+00 1.468861e+01
clicks_in 0.000000e+00 4.300000e+01 1.880000e+02 2.593061e+03 9.867500e+02 7.274890e+05
log_clicks_in 0.000000e+00 3.784190e+00 5.241747e+00 5.199044e+00 6.895430e+00 1.349736e+01
clicks_out 0.000000e+00 0.000000e+00 1.300000e+01 7.503346e+02 1.800000e+02 3.190050e+05
log_clicks_out 0.000000e+00 0.000000e+00 2.639057e+00 2.816838e+00 5.198497e+00 1.267297e+01
clicks_per_view 0.000000e+00 0.000000e+00 2.727443e-02 8.082203e-02 1.192622e-01 9.974582e-01
log_clicks_per_view 0.000000e+00 0.000000e+00 2.690911e-02 7.193058e-02 1.126698e-01 6.918755e-01
num_categories 1.000000e+00 6.000000e+00 9.000000e+00 1.132224e+01 1.400000e+01 2.200000e+02
log_num_categories 6.931472e-01 1.945910e+00 2.302585e+00 2.338079e+00 2.708050e+00 5.398163e+00
num_links_internal 1.000000e+00 2.100000e+01 4.400000e+01 1.131882e+02 1.510000e+02 5.000000e+02
log_num_links_internal 6.931472e-01 3.091042e+00 3.806662e+00 4.061465e+00 5.023881e+00 6.216606e+00
num_editors 1.000000e+00 1.800000e+01 3.400000e+01 6.916571e+01 8.300000e+01 3.730000e+02
log_num_editors 6.931472e-01 2.944439e+00 3.555348e+00 3.705627e+00 4.430817e+00 5.924256e+00
num_edits 1.000000e+00 3.100000e+01 6.300000e+01 1.379969e+02 1.750000e+02 5.000000e+02
log_num_edits 6.931472e-01 3.465736e+00 4.158883e+00 4.322748e+00 5.170484e+00 6.216606e+00
creation_date_timestamp 9.797972e+08 1.124904e+09 1.212924e+09 1.271335e+09 1.403138e+09 1.727716e+09
log_creation_date_timestamp 2.070286e+01 2.084096e+01 2.091630e+01 2.095263e+01 2.106198e+01 2.127007e+01
links_per_word 0.000000e+00 0.000000e+00 0.000000e+00 1.371059e-01 0.000000e+00 1.800000e+01
log_links_per_word 0.000000e+00 0.000000e+00 0.000000e+00 5.853933e-02 0.000000e+00 2.944439e+00
cat_mathematics_related_lists 0.000000e+00 0.000000e+00 0.000000e+00 7.379268e-03 0.000000e+00 1.000000e+00
cat_archaeological_artifacts 0.000000e+00 0.000000e+00 0.000000e+00 6.156668e-03 0.000000e+00 1.000000e+00
cat_number_theory 0.000000e+00 0.000000e+00 0.000000e+00 7.772247e-03 0.000000e+00 1.000000e+00
cat_quantum_mechanics 0.000000e+00 0.000000e+00 0.000000e+00 1.135272e-02 0.000000e+00 1.000000e+00
cat_dynamical_systems 0.000000e+00 0.000000e+00 0.000000e+00 9.562484e-03 0.000000e+00 1.000000e+00
cat_20th_century_american_mathematicians 0.000000e+00 0.000000e+00 0.000000e+00 2.558728e-02 0.000000e+00 1.000000e+00
cat_measurement 0.000000e+00 0.000000e+00 0.000000e+00 6.549646e-03 0.000000e+00 1.000000e+00
cat_astrophysics 0.000000e+00 0.000000e+00 0.000000e+00 7.073618e-03 0.000000e+00 1.000000e+00
cat_21st_century_american_mathematicians 0.000000e+00 0.000000e+00 0.000000e+00 2.737752e-02 0.000000e+00 1.000000e+00
cat_pseudohistory 0.000000e+00 0.000000e+00 0.000000e+00 7.248275e-03 0.000000e+00 1.000000e+00
cat_creators_of_writing_systems 0.000000e+00 0.000000e+00 0.000000e+00 4.410095e-03 0.000000e+00 1.000000e+00
cat_historiography 0.000000e+00 0.000000e+00 0.000000e+00 4.977731e-03 0.000000e+00 1.000000e+00
cat_historical_eras 0.000000e+00 0.000000e+00 0.000000e+00 4.017116e-03 0.000000e+00 1.000000e+00
cat_thermodynamics 0.000000e+00 0.000000e+00 0.000000e+00 8.427212e-03 0.000000e+00 1.000000e+00
cat_nuclear_physics 0.000000e+00 0.000000e+00 0.000000e+00 8.558205e-03 0.000000e+00 1.000000e+00
cat_death_conspiracy_theories 0.000000e+00 0.000000e+00 0.000000e+00 5.501703e-03 0.000000e+00 1.000000e+00
cat_topology 0.000000e+00 0.000000e+00 0.000000e+00 8.863855e-03 0.000000e+00 1.000000e+00
cat_cryptography 0.000000e+00 0.000000e+00 0.000000e+00 7.553925e-03 0.000000e+00 1.000000e+00
cat_philosophers_of_history 0.000000e+00 0.000000e+00 0.000000e+00 4.628417e-03 0.000000e+00 1.000000e+00
cat_mathematical_logic 0.000000e+00 0.000000e+00 0.000000e+00 7.117282e-03 0.000000e+00 1.000000e+00
cat_condensed_matter_physics 0.000000e+00 0.000000e+00 0.000000e+00 1.047943e-02 0.000000e+00 1.000000e+00
cat_fellows_of_the_american_physical_society 0.000000e+00 0.000000e+00 0.000000e+00 5.763689e-03 0.000000e+00 1.000000e+00
cat_physical_quantities 0.000000e+00 0.000000e+00 0.000000e+00 8.296219e-03 0.000000e+00 1.000000e+00
cat_living_people 0.000000e+00 0.000000e+00 0.000000e+00 7.916339e-02 0.000000e+00 1.000000e+00
cat_mechanics 0.000000e+00 0.000000e+00 0.000000e+00 5.938346e-03 0.000000e+00 1.000000e+00
cat_destroyed_populated_places 0.000000e+00 0.000000e+00 0.000000e+00 5.545367e-03 0.000000e+00 1.000000e+00
cat_pseudohistorians 0.000000e+00 0.000000e+00 0.000000e+00 5.589032e-03 0.000000e+00 1.000000e+00
cat_fellows_of_the_american_mathematical_society 0.000000e+00 0.000000e+00 0.000000e+00 4.266003e-02 0.000000e+00 1.000000e+00
library(ggplot2)
## Warning: pakiet 'ggplot2' został zbudowany w wersji R 4.4.3
library(stringr)


num_cols <- train %>%
  select(where(is.numeric)) %>%
  names()

cols_with_log <- num_cols[
  paste0("log_", num_cols) %in% names(train)
]


cols_with_log <- setdiff(cols_with_log, "log_clicks_per_view")


plot_for_col <- function(col) {
  
  log_col <- paste0("log_", col)
  
  p1 <- ggplot(train, aes_string(x = col)) +
    geom_histogram(bins = 40, fill = "steelblue", alpha = 0.7) +
    ggtitle(paste("Histogram:", col))
  
  p2 <- ggplot(train, aes_string(x = log_col)) +
    geom_histogram(bins = 40, fill = "darkorange", alpha = 0.7) +
    ggtitle(paste("Histogram:", log_col))
  
  p3 <- ggplot(train, aes_string(x = col, y = "log_clicks_per_view")) +
    geom_point(alpha = 0.4) +
    ggtitle(paste("log_clicks_per_view vs", col))
  
  p4 <- ggplot(train, aes_string(x = log_col, y = "log_clicks_per_view")) +
    geom_point(alpha = 0.4) +
    ggtitle(paste("log_clicks_per_view vs", log_col))
  
  list(p1 = p1, p2 = p2, p3 = p3, p4 = p4)
}

all_plots <- map(cols_with_log, plot_for_col)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(ggplot2)
library(stringr)
library(patchwork)
## Warning: pakiet 'patchwork' został zbudowany w wersji R 4.4.3
num_cols <- train %>%
  select(where(is.numeric)) %>%
  names()

cols_with_log <- num_cols[
  paste0("log_", num_cols) %in% names(train)
]

cols_with_log <- setdiff(cols_with_log, "log_clicks_per_view")


panel_for_col <- function(col) {
  
  log_col <- paste0("log_", col)
  
  cor_raw <- cor(train[[col]], train$log_clicks_per_view, use = "complete.obs")
  cor_log <- cor(train[[log_col]], train$log_clicks_per_view, use = "complete.obs")
  
  p1 <- ggplot(train, aes_string(x = col)) +
    geom_histogram(bins = 40, fill = "steelblue", alpha = 0.7) +
    ggtitle(paste("Histogram:", col))
  
  p2 <- ggplot(train, aes_string(x = log_col)) +
    geom_histogram(bins = 40, fill = "darkorange", alpha = 0.7) +
    ggtitle(paste("Histogram:", log_col))
  
  p3 <- ggplot(train, aes_string(x = col, y = "log_clicks_per_view")) +
    geom_point(color = "steelblue", alpha = 0.1) +
    annotate("text", x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
             label = paste0("corr = ", round(cor_raw, 3)),
             size = 4, color = "black") +
    ggtitle(paste("log_clicks_per_view vs", col))
  
  p4 <- ggplot(train, aes_string(x = log_col, y = "log_clicks_per_view")) +
    geom_point(color = "darkorange", alpha = 0.1) +
    annotate("text", x = Inf, y = Inf, hjust = 1.1, vjust = 1.5,
             label = paste0("corr = ", round(cor_log, 3)),
             size = 4, color = "black") +
    ggtitle(paste("log_clicks_per_view vs", log_col))
  
  (p1 | p2) /
  (p3 | p4)
}



walk(cols_with_log, ~ print(panel_for_col(.x)))

cols <- c("log_clicks_per_view", "log_word_count", "num_images", "num_categories", "log_num_links_internal", "log_num_editors", "num_edits", "creation_date_timestamp", "log_links_per_word")


log_cols <- num_cols[str_starts(num_cols, "log_")] 
cols_with_log_version <- num_cols[ paste0("log_", num_cols) %in% names(train) ] 
cols_without_logs <- setdiff( num_cols, union(log_cols, cols_with_log_version) ) 

features <- union(cols, cols_without_logs)
features <- setdiff(features, c("clicks_in" ,"clicks_per_view","mo_page_views","clicks_out"))

features
##  [1] "log_clicks_per_view"                             
##  [2] "log_word_count"                                  
##  [3] "num_images"                                      
##  [4] "num_categories"                                  
##  [5] "log_num_links_internal"                          
##  [6] "log_num_editors"                                 
##  [7] "num_edits"                                       
##  [8] "creation_date_timestamp"                         
##  [9] "log_links_per_word"                              
## [10] "cat_mathematics_related_lists"                   
## [11] "cat_archaeological_artifacts"                    
## [12] "cat_number_theory"                               
## [13] "cat_quantum_mechanics"                           
## [14] "cat_dynamical_systems"                           
## [15] "cat_20th_century_american_mathematicians"        
## [16] "cat_measurement"                                 
## [17] "cat_astrophysics"                                
## [18] "cat_21st_century_american_mathematicians"        
## [19] "cat_pseudohistory"                               
## [20] "cat_creators_of_writing_systems"                 
## [21] "cat_historiography"                              
## [22] "cat_historical_eras"                             
## [23] "cat_thermodynamics"                              
## [24] "cat_nuclear_physics"                             
## [25] "cat_death_conspiracy_theories"                   
## [26] "cat_topology"                                    
## [27] "cat_cryptography"                                
## [28] "cat_philosophers_of_history"                     
## [29] "cat_mathematical_logic"                          
## [30] "cat_condensed_matter_physics"                    
## [31] "cat_fellows_of_the_american_physical_society"    
## [32] "cat_physical_quantities"                         
## [33] "cat_living_people"                               
## [34] "cat_mechanics"                                   
## [35] "cat_destroyed_populated_places"                  
## [36] "cat_pseudohistorians"                            
## [37] "cat_fellows_of_the_american_mathematical_society"

Normalizacja danych

train <- select(train, features)
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(features)
## 
##   # Now:
##   data %>% select(all_of(features))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
test  <- select(test,  features)

cols_to_scale <- setdiff(names(train), "log_clicks_per_view")

m <- sapply(train[cols_to_scale], mean)
s <- sapply(train[cols_to_scale], sd)

train[cols_to_scale] <- sweep(train[cols_to_scale], 2, m, "-")
train[cols_to_scale] <- sweep(train[cols_to_scale], 2, s, "/")

test[cols_to_scale] <- sweep(test[cols_to_scale], 2, m, "-")
test[cols_to_scale] <- sweep(test[cols_to_scale], 2, s, "/")

Clicks per view: 0 vs. >0

table(train$log_clicks_per_view > 0)
## 
## FALSE  TRUE 
## 10439 12463
table(test$log_clicks_per_view > 0)
## 
## FALSE  TRUE 
##  2591  3134

Klasy są zbalansowane i podobnie rozłożone w zbiorze treningowym i testowym:)

Cross-validation liczby sąsiadów

library(FNN)
## Warning: pakiet 'FNN' został zbudowany w wersji R 4.4.3
y <- ifelse(train$log_clicks_per_view > 0, 1, 0)
X <- train %>% select(-log_clicks_per_view)

set.seed(123)
K_folds <- 5
folds <- sample(rep(1:K_folds, length.out = nrow(X)))

k_values <- seq(1, 19, by = 2)
cv_results <- numeric(length(k_values))

for (i in seq_along(k_values)) {
  k <- k_values[i]
  acc_vec <- numeric(K_folds)
  
  for (fold in 1:K_folds) {
    train_idx <- which(folds != fold)
    test_idx  <- which(folds == fold)
    
    X_tr <- X[train_idx, ]
    X_te <- X[test_idx, ]
    y_tr <- y[train_idx]
    y_te <- y[test_idx]
    
    pred <- knn(
      train = X_tr,
      test  = X_te,
      cl    = y_tr,
      k     = k
    )
    
    acc_vec[fold] <- mean(pred == y_te)
  }
  
  cv_results[i] <- mean(acc_vec)
}

cv_table <- data.frame(
  k = k_values,
  accuracy = cv_results
)

cv_table
##     k  accuracy
## 1   1 0.7247402
## 2   3 0.7529036
## 3   5 0.7605452
## 4   7 0.7671821
## 5   9 0.7713303
## 6  11 0.7738630
## 7  13 0.7742119
## 8  15 0.7756091
## 9  17 0.7746922
## 10 19 0.7751289
library(plotly)
## Warning: pakiet 'plotly' został zbudowany w wersji R 4.4.3
## 
## Dołączanie pakietu: 'plotly'
## Następujący obiekt został zakryty z 'package:ggplot2':
## 
##     last_plot
## Następujący obiekt został zakryty z 'package:stats':
## 
##     filter
## Następujący obiekt został zakryty z 'package:graphics':
## 
##     layout
plot_ly(
  data = cv_table,
  x = ~k,
  y = ~accuracy,
  type = "scatter",
  mode = "lines+markers"
) %>%
  layout(
    title = "Cross Validation for k - klasyfikacja KNN",
    xaxis = list(title = "k"),
    yaxis = list(title = "Accuracy")
  )

Test modelu dla wybranego k

y_train <- ifelse(train$log_clicks_per_view > 0, 1, 0)
y_test  <- ifelse(test$log_clicks_per_view > 0, 1, 0)

X_train <- train %>% select(-log_clicks_per_view)
X_test  <- test  %>% select(-log_clicks_per_view)

set.seed(123)
k <- 11

pred <- knn(
  train = X_train,
  test  = X_test,
  cl    = y_train,
  k     = k
)

tab <- table(Predicted = pred, Actual = y_test)
tab
##          Actual
## Predicted    0    1
##         0 2017  747
##         1  574 2387
accuracy <- mean(pred == y_test)
accuracy
## [1] 0.7692576

Regresja KNN

y <- train$log_clicks_per_view
X <- train %>% select(-log_clicks_per_view)

set.seed(123)
K_folds <- 5
folds <- sample(rep(1:K_folds, length.out = nrow(X)))

k_values <- seq(5, 25, by = 2)
cv_rmse <- numeric(length(k_values))

for (i in seq_along(k_values)) {
  k <- k_values[i]
  rmse_vec <- numeric(K_folds)
  
  for (fold in 1:K_folds) {
    train_idx <- which(folds != fold)
    test_idx  <- which(folds == fold)
    
    X_tr <- X[train_idx, ]
    X_te <- X[test_idx, ]
    y_tr <- y[train_idx]
    y_te <- y[test_idx]
    
    pred <- knn.reg(
      train = X_tr,
      test  = X_te,
      y     = y_tr,
      k     = k
    )$pred
    
    rmse_vec[fold] <- sqrt(mean((pred - y_te)^2))
  }
  
  cv_rmse[i] <- mean(rmse_vec)
}

cv_reg_table <- data.frame(
  k = k_values,
  RMSE = cv_rmse
)

cv_reg_table
##     k       RMSE
## 1   5 0.08998771
## 2   7 0.08819632
## 3   9 0.08740525
## 4  11 0.08682771
## 5  13 0.08642395
## 6  15 0.08611523
## 7  17 0.08585791
## 8  19 0.08581224
## 9  21 0.08576875
## 10 23 0.08575809
## 11 25 0.08574710
plot_ly(
  data = cv_reg_table,
  x = ~k,
  y = ~RMSE,
  type = "scatter",
  mode = "lines+markers"
) %>%
  layout(
    title = "Cross Validation for k - regresja KNN",
    xaxis = list(title = "k"),
    yaxis = list(title = "RMSE")
  )
y_train <- train$log_clicks_per_view
y_test  <- test$log_clicks_per_view

X_train <- train %>% select(-log_clicks_per_view)
X_test  <- test  %>% select(-log_clicks_per_view)

k <- 25   

pred <- knn.reg(
  train = X_train,
  test  = X_test,
  y     = y_train,
  k     = k
)$pred

rmse <- sqrt(mean((pred - y_test)^2))
mae  <- mean(abs(pred - y_test))

ss_res <- sum((y_test - pred)^2)
ss_tot <- sum((y_test - mean(y_test))^2)

r2 <- 1 - ss_res/ss_tot


rmse
## [1] 0.08629388
mae
## [1] 0.05619395
r2
## [1] 0.3141905
pred_train <- knn.reg(
  train = X_train,
  test  = X_train,
  y     = y_train,
  k     = k
)$pred
pred_test <- pred

df_plot <- rbind(
  data.frame(
    set = "train",
    actual = y_train,
    pred = pred_train
  ),
  data.frame(
    set = "test",
    actual = y_test,
    pred = pred_test
  )
)

ggplot(df_plot, aes(x = actual, y = pred, color = set, shape = set)) +
  geom_point(alpha = 0.2, size = 2) +
  scale_color_manual(values = c("train" = "blue", "test" = "green4")) +
  scale_shape_manual(values = c("train" = 15, "test" = 17)) + 
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  labs(
    title = "Predicted vs Actual (train & test)",
    x = "Actual",
    y = "Predicted"
  ) +
  theme_minimal()

Combined approach

  1. regression for obserwations with clicks per view > 0
train_pos <- train %>% filter(log_clicks_per_view > 0)
test_pos  <- test  %>% filter(log_clicks_per_view > 0)

y_train <- train_pos$log_clicks_per_view
y_test  <- test_pos$log_clicks_per_view

X_train <- train_pos %>% select(-log_clicks_per_view)
X_test  <- test_pos  %>% select(-log_clicks_per_view)

k <- 25

pred <- knn.reg(
  train = X_train,
  test  = X_test,
  y     = y_train,
  k     = k
)$pred

rmse <- sqrt(mean((pred - y_test)^2))
mae  <- mean(abs(pred - y_test))

ss_res <- sum((y_test - pred)^2)
ss_tot <- sum((y_test - mean(y_test))^2)

r2 <- 1 - ss_res/ss_tot
rmse
## [1] 0.1003523
mae
## [1] 0.07223938
r2
## [1] 0.1579923
  1. Two-step KNN:
  • klasyfikacja, czy click per view jest dodatni

  • regresja dla wierszy sklasyfikowanych jako dodatnie

y_train <- train$log_clicks_per_view
y_test  <- test$log_clicks_per_view

X_train <- train %>% select(-log_clicks_per_view)
X_test  <- test  %>% select(-log_clicks_per_view)


y_train_bin <- ifelse(y_train > 0, 1, 0)
y_test_bin  <- ifelse(y_test  > 0, 1, 0)

k_class <- 11   
k_reg   <- 25   


pred_class <- knn(
  train = X_train,
  test  = X_test,
  cl    = y_train_bin,
  k     = k_class
)


pred_reg <- rep(0, length(pred_class))   

idx_pos <- which(pred_class == 1) 

if (length(idx_pos) > 0) {
  pred_reg[idx_pos] <- knn.reg(
    train = X_train,
    test  = X_test[idx_pos, ],
    y     = y_train,
    k     = k_reg
  )$pred
}

final_pred <- pred_reg

rmse <- sqrt(mean((final_pred - y_test)^2))
mae  <- mean(abs(final_pred - y_test))

ss_res <- sum((y_test - final_pred)^2)
ss_tot <- sum((y_test - mean(y_test))^2)
r2 <- 1 - ss_res/ss_tot

rmse
## [1] 0.0888856
mae
## [1] 0.05124505
r2
## [1] 0.2723771

Wizualizacja

pred_class_train <- knn(
  train = X_train,
  test  = X_train,
  cl    = ifelse(y_train > 0, 1, 0),
  k     = k_class
)

final_pred_train <- rep(0, length(pred_class_train))
idx_pos_train <- which(pred_class_train == 1)

if (length(idx_pos_train) > 0) {
  final_pred_train[idx_pos_train] <- knn.reg(
    train = X_train,
    test  = X_train[idx_pos_train, ],
    y     = y_train,
    k     = k_reg
  )$pred
}

pred_class_test <- knn(
  train = X_train,
  test  = X_test,
  cl    = ifelse(y_train > 0, 1, 0),
  k     = k_class
)


final_pred_test <- rep(0, length(pred_class_test))
idx_pos_test <- which(pred_class_test == 1)

if (length(idx_pos_test) > 0) {
  final_pred_test[idx_pos_test] <- knn.reg(
    train = X_train,
    test  = X_test[idx_pos_test, ],
    y     = y_train,
    k     = k_reg
  )$pred
}
df_plot <- rbind(
  data.frame(
    set = "train",
    actual = y_train,
    pred = final_pred_train
  ),
  data.frame(
    set = "test",
    actual = y_test,
    pred = final_pred_test
  )
)

ggplot(df_plot, aes(x = actual, y = pred, color = set, shape = set)) +
  geom_point(alpha = 0.2, size = 2) +
  scale_color_manual(values = c("train" = "blue", "test" = "green4")) +
  scale_shape_manual(values = c("train" = 15, "test" = 17)) + 
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  labs(
    title = "Predicted vs Actual (train & test) — 2-step kNN model",
    x = "Actual",
    y = "Predicted"
  ) +
  theme_minimal()

Wniosek: proste podejście dawało bardzo podobne - jeśli nie lepsze - rezultaty.